SCENIC Protocol - Case study - Selected cancer scRNA-seq data sets

Author: Bram Van de Sande

Date: 6 AUG 2019

Outline: SCENIC workflow for selected scRNAseq experiment on human cancer biopsies.

Accession ID Cancer type
GSE115978 SKCM
GSE103322 HNSC
E-MTAB-6149/E-MTAB-6653 LUSC, LUAD

Cancer type acronyms are taken from TCGA: https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/tcga-study-abbreviations

This notebook can be run in a miniconda environment which can be installed and configured as a Jupyter kernel via the following script: https://github.com/aertslab/pySCENIC/blob/master/miniconda_environment_installation.sh

In [1]:
import os, glob, re, pickle
from functools import partial
from collections import OrderedDict
import operator as op
from cytoolz import compose

import pandas as pd
import seaborn as sns
import numpy as np
import scanpy as sc
import anndata as ad
import matplotlib as mpl
import matplotlib.pyplot as plt

from pyscenic.export import export2loom, add_scenic_metadata
from pyscenic.utils import load_motifs
from pyscenic.transform import df2regulons
from pyscenic.aucell import aucell
from pyscenic.binarization import binarize
from pyscenic.rss import regulon_specificity_scores
from pyscenic.plotting import plot_binarization, plot_rss

from IPython.display import HTML, display
/Users/bramvandesande/miniconda3/envs/scenic_protocol/lib/python3.6/site-packages/dask/config.py:161: YAMLLoadWarning: calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. Please read https://msg.pyyaml.org/load for full details.
  data = yaml.load(f.read()) or {}
In [2]:
# Set maximum number of jobs for Scanpy.
sc.settings.njobs = 32

Folder structure.

In [3]:
RESOURCES_FOLDERNAME = "/Users/bramvandesande/Projects/lcb/protocol/resources/"
AUXILLIARIES_FOLDERNAME = "/Users/bramvandesande/Projects/lcb/protocol/auxilliaries/"
RESULTS_FOLDERNAME = "/Users/bramvandesande/Projects/lcb/protocol/results/"
FIGURES_FOLDERNAME = "/Users/bramvandesande/Projects/lcb/protocol/figures/"
In [4]:
sc.settings.figdir = FIGURES_FOLDERNAME

Auxilliary functions.

In [5]:
BASE_URL = "http://motifcollections.aertslab.org/v9/logos/"
COLUMN_NAME_LOGO = "MotifLogo"
COLUMN_NAME_MOTIF_ID = "MotifID"
COLUMN_NAME_TARGETS = "TargetGenes"
In [6]:
def savesvg(fname: str, fig, folder: str=FIGURES_FOLDERNAME) -> None:
    """
    Save figure as vector-based SVG image format.
    """
    fig.tight_layout()
    fig.savefig(os.path.join(folder, fname), format='svg')
In [7]:
def display_logos(df: pd.DataFrame, top_target_genes: int = 3, base_url: str = BASE_URL):
    """
    :param df:
    :param base_url:
    """
    # Make sure the original dataframe is not altered.
    df = df.copy()
    
    # Add column with URLs to sequence logo.
    def create_url(motif_id):
        return '<img src="{}{}.png" style="max-height:124px;"></img>'.format(base_url, motif_id)
    df[("Enrichment", COLUMN_NAME_LOGO)] = list(map(create_url, df.index.get_level_values(COLUMN_NAME_MOTIF_ID)))
    
    # Truncate TargetGenes.
    def truncate(col_val):
        return sorted(col_val, key=op.itemgetter(1))[:top_target_genes]
    df[("Enrichment", COLUMN_NAME_TARGETS)] = list(map(truncate, df[("Enrichment", COLUMN_NAME_TARGETS)]))
    
    MAX_COL_WIDTH = pd.get_option('display.max_colwidth')
    pd.set_option('display.max_colwidth', -1)
    display(HTML(df.head().to_html(escape=False)))
    pd.set_option('display.max_colwidth', MAX_COL_WIDTH)

Auxilliary data sets.

In [8]:
# Downloaded fromm pySCENIC github repo: https://github.com/aertslab/pySCENIC/tree/master/resources
HUMAN_TFS_FNAME = os.path.join(AUXILLIARIES_FOLDERNAME, 'lambert2018.txt')
# Ranking databases. Downloaded from cisTargetDB: https://resources.aertslab.org/cistarget/
RANKING_DBS_FNAMES = list(map(lambda fn: os.path.join(AUXILLIARIES_FOLDERNAME, fn),
                       ['hg19-500bp-upstream-10species.mc9nr.feather',
                       'hg19-tss-centered-5kb-10species.mc9nr.feather',
                        'hg19-tss-centered-10kb-10species.mc9nr.feather']))
# Motif annotations. Downloaded from cisTargetDB: https://resources.aertslab.org/cistarget/
MOTIF_ANNOTATIONS_FNAME = os.path.join(AUXILLIARIES_FOLDERNAME, 'motifs-v9-nr.hgnc-m0.001-o0.0.tbl')

GSE115978 - Cutaneous Melanoma (SKCM)

Publication: Jerby-Arnon L et al. A Cancer Cell Program Promotes T Cell Exclusion and Resistance to Checkpoint Blockade. Cell 2018 https://dx.doi.org/10.1016/j.cell.2018.09.006

Data set acquisition: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE115978

Remarks:

  1. This data set includes samples from previous study (Itay Tirosh et al.).
  2. Codes:
Code Description
OR Objective Response
ICI Immune Checkpoint Inhibitor
In [9]:
DATASET_ID = "GSE115978"
TCGA_CODE = 'SKCM'

Resources downloaded.

In [10]:
# Downloaded from GEO on 28 FEB 2019.
CELL_ANNOTATIONS_FNAME = os.path.join(RESOURCES_FOLDERNAME, "GSE115978_cell.annotations.csv")
# Downloaded from Cell Journal website on 1 MAR 2019.
SAMPLE_METADATA_FNAME = os.path.join(RESOURCES_FOLDERNAME, "1-s2.0-S0092867418311784-mmc1.xlsx")
# Downloaded from GEO on 1 MAR 2019.
EXP_MTX_TPM_FNAME = os.path.join(RESOURCES_FOLDERNAME, 'GSE115978_tpm.csv')
EXP_MTX_COUNTS_FNAME = os.path.join(RESOURCES_FOLDERNAME, 'GSE115978_counts.csv')

Results created.

In [11]:
METADATA_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.metadata.csv'.format(DATASET_ID))
EXP_MTX_QC_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.qc.tpm.csv'.format(DATASET_ID))
ADJACENCIES_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.adjacencies.tsv'.format(DATASET_ID))
MOTIFS_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.motifs.csv'.format(DATASET_ID))
REGULONS_DAT_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.regulons.dat'.format(DATASET_ID))
AUCELL_MTX_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.auc.csv'.format(DATASET_ID))
BIN_MTX_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.bin.csv'.format(DATASET_ID))
THR_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.thresholds.csv'.format(DATASET_ID))
ANNDATA_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.h5ad'.format(DATASET_ID))
LOOM_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}_{}.loom'.format(TCGA_CODE, DATASET_ID))

STEP 0: Preprocessing

METADATA CLEANING

In [12]:
df_annotations = pd.read_csv(CELL_ANNOTATIONS_FNAME)
df_annotations['samples'] = df_annotations['samples'].str.upper()
df_annotations.rename(columns={'cell.types': 'cell_type', 'cells': 'cell_id', 'samples': 'sample_id', 
                               'treatment.group': 'treatment_group', 'Cohort': 'cohort'}, inplace=True)
df_annotations['cell_type'] = df_annotations.cell_type.replace({
    'Mal': 'Malignant Cell',
    'Endo.': 'Endothelial Cell',
    'T.CD4': 'Thelper Cell',
    'CAF': 'Fibroblast',
    'T.CD8': 'Tcytotoxic Cell',
    'T.cell': 'T Cell',
    'NK': 'NK Cell',
    'B.cell': 'B Cell'})
df_samples = pd.read_excel(SAMPLE_METADATA_FNAME, header=2)
df_samples = df_samples.drop(['Cohort'], axis=1)
df_samples['Sample'] = df_samples.Sample.str.upper()
df_metadata = pd.merge(df_annotations, df_samples, left_on='sample_id', right_on='Sample')
df_metadata = df_metadata.drop(['Sample', 'treatment_group'], axis=1)
df_metadata.rename(columns={'Patient': 'patient_id',
                           'Age': 'age', 'Sex': 'sex', 'Lesion type': 'lesion_type', 'Site': 'site',
                           'Treatment': 'treatment', 'Treatment group': 'treatment_group'}, inplace=True)
df_metadata.to_csv(METADATA_FNAME, index=False)
df_metadata.head()
Out[12]:
cell_id sample_id cell_type cohort no.of.genes no.of.reads patient_id age sex treatment treatment_group lesion_type site
0 cy78_CD45_neg_1_B04_S496_comb MEL78 Malignant Cell Tirosh 8258 357919 Mel78 73 M WDVAX, ipilimumab + nivolumab Post-ICI (resistant) metastasis Small bowel
1 cy78_CD45_neg_3_H06_S762_comb MEL78 Malignant Cell Tirosh 7409 380341 Mel78 73 M WDVAX, ipilimumab + nivolumab Post-ICI (resistant) metastasis Small bowel
2 cy78_CD45_neg_1_D07_S523_comb MEL78 Malignant Cell Tirosh 8548 394632 Mel78 73 M WDVAX, ipilimumab + nivolumab Post-ICI (resistant) metastasis Small bowel
3 cy78_CD45_neg_3_D01_S709_comb MEL78 Malignant Cell Tirosh 8472 425938 Mel78 73 M WDVAX, ipilimumab + nivolumab Post-ICI (resistant) metastasis Small bowel
4 cy78_CD45_neg_2_B08_S596_comb MEL78 Malignant Cell Tirosh 6475 70466 Mel78 73 M WDVAX, ipilimumab + nivolumab Post-ICI (resistant) metastasis Small bowel

EXPRESSION MATRIX QC

In [13]:
df_tpm = pd.read_csv(EXP_MTX_TPM_FNAME, index_col=0)
df_tpm.shape
Out[13]:
(23686, 7186)
In [14]:
#df_counts = pd.read_csv(EXP_MTX_COUNTS_FNAME, index_col=0)
#df_counts.shape
In [15]:
adata = sc.AnnData(X=df_tpm.T.sort_index())
df_obs = df_metadata[['cell_id', 'sample_id', 'cell_type', 'cohort', 'patient_id', 'age', 'sex', 'treatment',
                                                           'treatment_group', 'lesion_type', 'site']].set_index('cell_id').sort_index()
adata.obs = df_obs
adata.var_names_make_unique()
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)
# Store non-log transformed data as raw. This data can be used via the use_raw parameters available for many functions.
# In the scanpy's tutorials this is used to stored all genes in log-transformed counts before retaining only Highly Variable Genes (HVG). 
# Because in this case no filtering is done we use this feature to store raw counts.
adata.raw = adata 
sc.pp.log1p(adata)
adata
Out[15]:
AnnData object with n_obs × n_vars = 7186 × 22527 
    obs: 'sample_id', 'cell_type', 'cohort', 'patient_id', 'age', 'sex', 'treatment', 'treatment_group', 'lesion_type', 'site', 'n_genes'
    var: 'n_cells'
In [16]:
adata.write_h5ad(ANNDATA_FNAME) # Categorical dtypes are created.
... storing 'sample_id' as categorical
... storing 'cell_type' as categorical
... storing 'cohort' as categorical
... storing 'patient_id' as categorical
... storing 'sex' as categorical
... storing 'treatment' as categorical
... storing 'treatment_group' as categorical
... storing 'lesion_type' as categorical
... storing 'site' as categorical
In [17]:
adata.to_df().to_csv(EXP_MTX_QC_FNAME)

STEP 1: Network inference based on GRNBoost2 from CLI

For this step the CLI version of SCENIC is used. This step can be deployed on an High Performance Computing system.

Output: List of adjacencies between a TF and its targets stored in ADJACENCIES_FNAME.

!pyscenic grn {EXP_MTX_QC_FNAME} {HUMAN_TFS_FNAME} -o {ADJACENCIES_FNAME} --num_workers 4
2019-04-25 11:22:20,360 - pyscenic.cli.pyscenic - INFO - Loading expression matrix.
2019-04-25 11:23:37,612 - pyscenic.cli.pyscenic - INFO - Inferring regulatory networks.
preparing dask client
parsing input
creating dask graph
4 partitions
computing dask graph
not shutting down client, client was created externally
finished
2019-04-26 03:58:01,096 - pyscenic.cli.pyscenic - INFO - Writing results to file.

STEP 2-3: Regulon prediction aka cisTarget from CLI

For this step the CLI version of SCENIC is used. This step can be deployed on an High Performance Computing system.

Output: List of adjacencies between a TF and its targets stored in MOTIFS_FNAME.

In [18]:
DBS_PARAM = ' '.join(RANKING_DBS_FNAMES)
!pyscenic ctx {ADJACENCIES_FNAME} {DBS_PARAM} \ --annotations_fname {MOTIF_ANNOTATIONS_FNAME} \ --expression_mtx_fname {EXP_MTX_QC_FNAME} \ --output {MOTIFS_FNAME} \ --num_workers 26
2019-04-26 10:23:50,395 - pyscenic.cli.pyscenic - INFO - Creating modules.
2019-04-26 10:23:56,229 - pyscenic.cli.pyscenic - INFO - Loading expression matrix.
2019-04-26 10:29:09,101 - pyscenic.cli.pyscenic - INFO - Loading databases.
2019-04-26 10:29:09,101 - pyscenic.cli.pyscenic - INFO - Calculating regulons.
2019-04-26 11:11:10,784 - pyscenic.cli.pyscenic - INFO - Writing results to file

The results is a list of enriched motifs for the modules.

Column name Description
TF Transcription Factor (TF) for which an enriched motif is discovered.
motifID The identifier of the enriched motif.
AUC Area Under the recovery Curve statistic for this enriched motif.
NES Normalized Enrichment Score for this enriched motif.
Context Collection of tags clarifying the origin of the module for this factor: e.g. ranking database, ...
Annotation Verbose description of the annotation available for this motif.
MotifSimilarityQvalue The TomTom derived Q-value for motif similarity (if used for assigning the factor to this enriched motif).
OrthologousIdentity The Amino Acid Identity between factors (if used for assigning the factor to this enriched motif).
RankAtMax The position of the Leading Edge which is used as a threshold on the whole genome ranking of the motif to decide if a gene in the input is a direct target of a TF that binds this motif.
TargetGenes A list of pairs: genes and their associated weights from GENIE3/GRNBoost2.
In [19]:
df_motifs = load_motifs(MOTIFS_FNAME)
In [20]:
df_motifs.head()
Out[20]:
Enrichment
AUC Annotation Context MotifSimilarityQvalue NES OrthologousIdentity RankAtMax TargetGenes
TF MotifID
ASCL1 swissregulon__hs__MYFfamily.p2 0.050424 gene is orthologous to ENSMUSG00000020052 in M... (activating, weight>75.0%, hg19-500bp-upstream... 0.000804 3.078687 0.898305 654 [(CHSY3, 0.46559999032508703), (ZNF668, 0.8184...
ATF1 dbcorrdb__BRCA1__ENCSR000EDB_1__m1 0.041473 gene is annotated for similar motif dbcorrdb__... (activating, weight>75.0%, hg19-500bp-upstream... 0.000112 3.039716 1.000000 3823 [(QRICH1, 0.6030282817865943), (KLHL12, 0.4067...
ATF3 transfac_public__M00039 0.044822 gene is annotated for similar motif transfac_p... (activating, weight>75.0%, hg19-500bp-upstream... 0.000052 3.100943 1.000000 997 [(NR4A3, 0.4479121751322677), (PCSK1, 1.457322...
dbcorrdb__ATF3__ENCSR000BJY_1__m3 0.046101 gene is directly annotated (activating, weight>75.0%, hg19-500bp-upstream... 0.000000 3.364969 1.000000 4598 [(TP53INP2, 0.4479121751322677), (EGR3, 1.4573...
dbcorrdb__CREB1__ENCSR000BRB_1__m1 0.046069 motif similar to dbcorrdb__ATF3__ENCSR000BKC_1... (activating, weight>75.0%, hg19-500bp-upstream... 0.000000 3.358256 1.000000 4629 [(NR4A3, 0.4479121751322677), (EGR1, 1.4573220...

Display the enriched motifs with their associated sequence logos.

In [21]:
display_logos(df_motifs.head())
Enrichment
AUC Annotation Context MotifSimilarityQvalue NES OrthologousIdentity RankAtMax TargetGenes MotifLogo
TF MotifID
ASCL1 swissregulon__hs__MYFfamily.p2 0.050424 gene is orthologous to ENSMUSG00000020052 in M. musculus (identity = 89%) which is annotated for similar motif hocomoco__ASCL1_MOUSE.H11MO.0.A ('ASCL1_MOUSE'; q-value = 0.000804) (activating, weight>75.0%, hg19-500bp-upstream-10species) 0.000804 3.078687 0.898305 654 [(STXBP5, 0.3552535926427597), (CHRNA7, 0.3860923702554104), (FCRLB, 0.3893990879386699)]
ATF1 dbcorrdb__BRCA1__ENCSR000EDB_1__m1 0.041473 gene is annotated for similar motif dbcorrdb__ATF1__ENCSR000DNZ_1__m1 ('ATF1 (ENCSR000DNZ-1, motif 1)'; q-value = 0.000112) (activating, weight>75.0%, hg19-500bp-upstream-10species) 0.000112 3.039716 1.000000 3823 [(RBM27, 0.3524876883570909), (ACAA1, 0.3535980180842792), (SLC39A9, 0.3538371292659718)]
ATF3 transfac_public__M00039 0.044822 gene is annotated for similar motif transfac_pro__M00981 ('V$CREBATF_Q6: CREB, ATF'; q-value = 5.19e-05) (activating, weight>75.0%, hg19-500bp-upstream-10species) 0.000052 3.100943 1.000000 997 [(GEM, 0.3687819202995441), (C6orf52, 0.3767115431313599), (ID4, 0.3778118069916045)]
dbcorrdb__ATF3__ENCSR000BJY_1__m3 0.046101 gene is directly annotated (activating, weight>75.0%, hg19-500bp-upstream-10species) 0.000000 3.364969 1.000000 4598 [(MPP5, 0.35211458906961063), (KLF10, 0.35340280449268463), (PDK4, 0.3539377141941185)]
dbcorrdb__CREB1__ENCSR000BRB_1__m1 0.046069 motif similar to dbcorrdb__ATF3__ENCSR000BKC_1__m2 ('ATF3 (ENCSR000BKC-1, motif 2)'; q-value = 1.01e-09) which is directly annotated (activating, weight>75.0%, hg19-500bp-upstream-10species) 0.000000 3.358256 1.000000 4629 [(ZNF331, 0.35211458906961063), (HS6ST3, 0.35340280449268463), (RWDD2A, 0.3539377141941185)]

STEP 4: Cellular enrichment aka AUCell

REGULON CREATION

Regulons can easily be created from this list of enriched motifs via pyscenic.transform.df2regulons. Here we provide an auxilliary function to carefully select the enriched motifs that contribute to the regulons.

In [22]:
def derive_regulons(motifs, db_names=('hg19-tss-centered-10kb-10species', 
                                 'hg19-500bp-upstream-10species', 
                                 'hg19-tss-centered-5kb-10species')):
    motifs.columns = motifs.columns.droplevel(0)

    def contains(*elems):
        def f(context):
            return any(elem in context for elem in elems)
        return f

    # For the creation of regulons we only keep the 10-species databases and the activating modules. We also remove the
    # enriched motifs for the modules that were created using the method 'weight>50.0%' (because these modules are not part
    # of the default settings of modules_from_adjacencies anymore.
    motifs = motifs[
        np.fromiter(map(compose(op.not_, contains('weight>50.0%')), motifs.Context), dtype=np.bool) & \
        np.fromiter(map(contains(*db_names), motifs.Context), dtype=np.bool) & \
        np.fromiter(map(contains('activating'), motifs.Context), dtype=np.bool)]

    # We build regulons only using enriched motifs with a NES of 3.0 or higher; we take only directly annotated TFs or TF annotated
    # for an orthologous gene into account; and we only keep regulons with at least 10 genes.
    regulons = list(filter(lambda r: len(r) >= 10, df2regulons(motifs[(motifs['NES'] >= 3.0) 
                                                                      & ((motifs['Annotation'] == 'gene is directly annotated')
                                                                        | (motifs['Annotation'].str.startswith('gene is orthologous to')
                                                                           & motifs['Annotation'].str.endswith('which is directly annotated for motif')))
                                                                     ])))
    
    # Rename regulons, i.e. remove suffix.
    return list(map(lambda r: r.rename(r.transcription_factor), regulons))
In [23]:
regulons = derive_regulons(df_motifs)
In [24]:
# Pickle these regulons.
with open(REGULONS_DAT_FNAME, 'wb') as f:
    pickle.dump(regulons, f)

AUCELL

%%time auc_mtx = aucell(df_tpm.T, regulons, num_workers=26) auc_mtx.to_csv(AUCELL_MTX_FNAME)
CPU times: user 23.1 s, sys: 10.7 s, total: 33.8 s
Wall time: 39.2 s
In [25]:
auc_mtx = pd.read_csv(AUCELL_MTX_FNAME, index_col=0)

OPTIONAL STEP 5 - Regulon activity binarization

bin_mtx, thresholds = binarize(auc_mtx) bin_mtx.to_csv(BIN_MTX_FNAME) thresholds.to_frame().rename(columns={0:'threshold'}).to_csv(THR_FNAME)
100%|██████████| 370/370 [33:21<00:00,  3.39s/it]
In [26]:
bin_mtx = pd.read_csv(BIN_MTX_FNAME, index_col=0)
thresholds = pd.read_csv(THR_FNAME, index_col=0).threshold
In [27]:
fig, ((ax1, ax2, ax3, ax4), (ax5, ax6, ax7, ax8)) = plt.subplots(2, 4, figsize=(8, 4), dpi=100)

plot_binarization(auc_mtx, 'NFKB2', thresholds['NFKB2'], ax=ax1)
plot_binarization(auc_mtx, 'MITF', thresholds['MITF'], ax=ax2)
plot_binarization(auc_mtx, 'FOXP3', thresholds['FOXP3'], ax=ax3)
plot_binarization(auc_mtx, 'PAX5', thresholds['PAX5'], ax=ax4)
plot_binarization(auc_mtx, 'IRF8', thresholds['IRF8'], ax=ax5)
plot_binarization(auc_mtx, 'IRF3', thresholds['IRF3'], ax=ax6)
plot_binarization(auc_mtx, 'MLX', thresholds['MLX'], ax=ax7)
plot_binarization(auc_mtx, 'YY1', thresholds['YY1'], ax=ax8)

plt.tight_layout()
savesvg('hists - GSE115978 - binarization.svg', fig)

Create heatmap with binarized regulon activity.

In [28]:
def palplot(pal, names, colors=None, size=1):
    n = len(pal)
    f, ax = plt.subplots(1, 1, figsize=(n * size, size))
    ax.imshow(np.arange(n).reshape(1, n),
              cmap=mpl.colors.ListedColormap(list(pal)),
              interpolation="nearest", aspect="auto")
    ax.set_xticks(np.arange(n) - .5)
    ax.set_yticks([-.5, .5])
    ax.set_xticklabels([])
    ax.set_yticklabels([])
    colors = n * ['k'] if colors is None else colors
    for idx, (name, color) in enumerate(zip(names, colors)):
        ax.text(0.0+idx, 0.0, name, color=color, horizontalalignment='center', verticalalignment='center')
    return f
In [29]:
N_COLORS = len(adata.obs.cell_type.dtype.categories)
COLORS = [color['color'] for color in mpl.rcParams["axes.prop_cycle"]]
In [30]:
cell_type_color_lut = dict(zip(adata.obs.cell_type.dtype.categories, COLORS))
#cell_type_color_lut = dict(zip(adata.obs.cell_type.dtype.categories, adata.uns['cell_type_colors']))
cell_id2cell_type_lut = df_metadata.set_index('cell_id').cell_type.to_dict()
bw_palette = sns.xkcd_palette(["white", "black"])
In [31]:
sns.set()
sns.set_style("whitegrid")
fig = palplot(bw_palette, ['OFF', 'ON'], ['k', 'w'])
savesvg('legend - GSE115978 - on_off.svg', fig)
In [32]:
sns.set()
sns.set(font_scale=0.8)
fig = palplot(sns.color_palette(COLORS), adata.obs.cell_type.dtype.categories, size=1.0)
savesvg('legend - GSE115978 - cell_type_colors.svg', fig)
In [33]:
sns.set()
sns.set(font_scale=1.0)
sns.set_style("ticks", {"xtick.minor.size": 1, "ytick.minor.size": 0.1})
g = sns.clustermap(bin_mtx.T, 
               col_colors=auc_mtx.index.map(cell_id2cell_type_lut).map(cell_type_color_lut),
               cmap=bw_palette, figsize=(20,20))
g.ax_heatmap.set_xticklabels([])
g.ax_heatmap.set_xticks([])
g.ax_heatmap.set_xlabel('Cells')
g.ax_heatmap.set_ylabel('Regulons')
g.ax_col_colors.set_yticks([0.5])
g.ax_col_colors.set_yticklabels(['Cell Type'])
g.cax.set_visible(False)
g.fig.savefig(os.path.join(FIGURES_FOLDERNAME, 'clustermap - GSE115978.png'), format='png')

Save clustered binarized heatmap to Excel for further inspection.

In [34]:
bin_mtx_clustered = bin_mtx.T.copy()
bin_mtx_clustered.rename(columns=df_annotations.set_index('cell_id')['cell_type'].to_dict(), inplace=True)
bin_mtx_clustered.iloc[g.dendrogram_row.reordered_ind, g.dendrogram_col.reordered_ind].to_excel(os.path.join(RESULTS_FOLDERNAME, 'GSE115978 - Binarized regulon activity.xlsx'))

Generate sequence logos.

In [35]:
def fetch_logo(regulon, base_url = BASE_URL):
    for elem in regulon.context:
        if elem.endswith('.png'):
            return '<img src="{}{}" style="max-height:124px;"></img>'.format(base_url, elem)
    return ""
In [36]:
df_regulons = pd.DataFrame(data=[list(map(op.attrgetter('name'), regulons)),
                                 list(map(len, regulons)),
                                 list(map(fetch_logo, regulons))], index=['name', 'count', 'logo']).T
In [37]:
MAX_COL_WIDTH = pd.get_option('display.max_colwidth')
pd.set_option('display.max_colwidth', -1)
display(HTML(df_regulons.head().to_html(escape=False)))
pd.set_option('display.max_colwidth', MAX_COL_WIDTH)
name count logo
0 AHR 25
1 ALX4 23
2 AR 163
3 ARID3A 49
4 ARNT 10

STEP 6: Non-linear projection and clustering

In this step a scanpy.AnnData object is created containing all metadata and results.

First, we select and retain only the most variable genes in the dataset.

PCA + tSNE PROJECTION

In [38]:
sns.set()
In [39]:
sc.pp.highly_variable_genes(adata)
sc.pl.highly_variable_genes(adata)
adata = adata[:, adata.var['highly_variable']]

Then we apply a linear-dimensional reduction technique (PCA).

In [40]:
sc.tl.pca(adata, svd_solver='arpack')
sc.pl.pca(adata, color='CD8A')
Trying to set attribute `.obsm` of view, making a copy.
In [41]:
sc.pl.pca_variance_ratio(adata, log=True)

Followed by a tSNE projection.

In [42]:
sc.tl.tsne(adata)
In [43]:
sc.set_figure_params(frameon=False, dpi=150, fontsize=8)
sc.pl.tsne(adata, color=['cell_type', 'lesion_type', 'patient_id'], 
           title=['GSE115978 - SKCM - Cell types', 'GSE115978 - SKCM - Lesion types',
                 'GSE115978 - SKCM - {} patients'.format(len(adata.obs.patient_id.unique()))], ncols=3, palette=COLORS,
          save=' - GSE115978 - PCA+tSNE.svg')
WARNING: Length of palette colors is smaller than the number of categories (palette length: 10, categories length: 31. Some categories will have the same color.
saving figure to file /Users/bramvandesande/Projects/lcb/protocol/figures/tsne - GSE115978 - PCA+tSNE.svg

We capture the non-linear projection based on PCA+tSNE for later storage in the loom file.

In [44]:
embedding_pca_tsne = pd.DataFrame(adata.obsm['X_tsne'], columns=[['_X', '_Y']], index=adata.obs_names)

We add all metadata derived from SCENIC to the scanpy.AnnData object.

In [45]:
add_scenic_metadata(adata, auc_mtx, regulons)
adata.write_h5ad(ANNDATA_FNAME)

AUCELL + tSNE PROJECTION

We change the tSNE projection so that it relies on AUCell instead of PCA.

In [46]:
sc.tl.tsne(adata, use_rep='X_aucell')
In [47]:
embedding_aucell_tsne = pd.DataFrame(adata.obsm['X_tsne'], columns=[['_X', '_Y']], index=adata.obs_names)
In [48]:
sc.set_figure_params(frameon=False, dpi=150, fontsize=8)
sc.pl.tsne(adata, color=['cell_type', 'lesion_type', 'patient_id'], 
           title=['GSE115978 - SKCM - Cell types', 'GSE115978 - SKCM - Lesion types',
                 'GSE115978 - SKCM - {} patients'.format(len(adata.obs.patient_id.unique()))], ncols=3, palette=COLORS,
          save=' - GSE115978 - AUCell+tSNE.svg')
WARNING: Length of palette colors is smaller than the number of categories (palette length: 10, categories length: 31. Some categories will have the same color.
saving figure to file /Users/bramvandesande/Projects/lcb/protocol/figures/tsne - GSE115978 - AUCell+tSNE.svg

CELL TYPE SPECIFIC REGULATORS - Z-SCORE

To find cell type specific regulators we use a Z score (i.e. the average AUCell score for the cells of a give type are standardized using the overall average AUCell scores and its standard deviation).

In [49]:
df_obs = adata.obs
signature_column_names = list(df_obs.select_dtypes('number').columns)
signature_column_names = list(filter(lambda s: s.startswith('Regulon('), signature_column_names))
df_scores = df_obs[signature_column_names + ['cell_type']]
df_results = ((df_scores.groupby(by='cell_type').mean() - df_obs[signature_column_names].mean())/ df_obs[signature_column_names].std()).stack().reset_index().rename(columns={'level_1': 'regulon', 0:'Z'})
df_results['regulon'] = list(map(lambda s: s[8:-1], df_results.regulon))
df_results[(df_results.Z >= 3.0)].sort_values('Z', ascending=False).head()
Out[49]:
cell_type regulon Z
1006 Endothelial Cell SOX7 6.406142
798 Endothelial Cell ERG 6.381740
1000 Endothelial Cell SOX17 6.350996
993 Endothelial Cell SMAD1 5.946043
1341 Fibroblast PRRX2 5.646514
In [50]:
df_heatmap = pd.pivot_table(data=df_results[df_results.Z >= 3.0].sort_values('Z', ascending=False),
                           index='cell_type', columns='regulon', values='Z')
#df_heatmap.drop(index='Myocyte', inplace=True) # We leave out Myocyte because many TFs are highly enriched (becuase of small number of cells).
fig, ax1 = plt.subplots(1, 1, figsize=(10, 8))
sns.heatmap(df_heatmap, ax=ax1, annot=True, fmt=".1f", linewidths=.7, cbar=False, square=True, linecolor='gray', 
            cmap="YlGnBu", annot_kws={"size": 6})
ax1.set_ylabel('')
savesvg('heatmap - GSE115978 - regulons.svg', fig)

CELL TYPE SPECIFIC REGULATORS - RSS

In [51]:
rss = regulon_specificity_scores(auc_mtx, adata.obs.cell_type)
rss.head()
Out[51]:
AHR ALX4 AR ARID3A ARNT ASCL2 ATF1 ATF2 ATF3 ATF4 ... ZNF75A ZNF778 ZNF790 ZNF791 ZNF805 ZNF821 ZNF836 ZSCAN12 ZSCAN21 ZSCAN9
? 0.119115 0.106216 0.127623 0.118305 0.117478 0.103804 0.111839 0.115555 0.120514 0.116006 ... 0.109259 0.106379 0.107645 0.107131 0.125820 0.095525 0.103625 0.128017 0.108031 0.103727
Malignant Cell 0.125245 0.104324 0.155486 0.111086 0.137692 0.104024 0.168358 0.213384 0.230929 0.247346 ... 0.151500 0.135952 0.108120 0.119802 0.148203 0.095611 0.100018 0.149919 0.138836 0.104287
Tcytotoxic Cell 0.197441 0.110162 0.165422 0.161381 0.227076 0.134539 0.208408 0.221141 0.201545 0.204216 ... 0.219096 0.167753 0.147290 0.149115 0.190870 0.154367 0.111887 0.227498 0.176874 0.127315
T Cell 0.143044 0.110061 0.134820 0.136850 0.140186 0.121091 0.144151 0.149977 0.140766 0.144019 ... 0.148411 0.136076 0.131913 0.125166 0.149107 0.107472 0.113069 0.158824 0.138966 0.118626
Thelper Cell 0.156558 0.114367 0.130805 0.136580 0.150097 0.125391 0.158675 0.162357 0.148578 0.154212 ... 0.162559 0.138234 0.143287 0.123008 0.155558 0.103925 0.112435 0.163183 0.148753 0.114099

5 rows × 370 columns

In [52]:
sns.set()
sns.set(style='whitegrid', font_scale=0.8)
fig, ((ax1, ax2, ax3, ax4), (ax5, ax6, ax7, ax8)) = plt.subplots(2, 4, figsize=(8, 6), dpi=100)
plot_rss(rss, 'B Cell', ax=ax1)
ax1.set_xlabel('')
plot_rss(rss, 'T Cell', ax=ax2)
ax2.set_xlabel('')
ax2.set_ylabel('')
plot_rss(rss, 'Thelper Cell', ax=ax3)
ax3.set_xlabel('')
ax3.set_ylabel('')
plot_rss(rss, 'Tcytotoxic Cell', ax=ax4)
ax4.set_xlabel('')
ax4.set_ylabel('')
plot_rss(rss, 'NK Cell', ax=ax5)
plot_rss(rss, 'Fibroblast', ax=ax6)
ax6.set_ylabel('')
plot_rss(rss, 'Macrophage', ax=ax7)
ax7.set_ylabel('')
plot_rss(rss, 'Endothelial Cell', ax=ax8)
ax8.set_ylabel('')
plt.tight_layout()
savesvg('plots - GSE115978 - rss.svg', fig)
In [53]:
sc.set_figure_params(frameon=False, dpi=150, fontsize=8)
sc.pl.tsne(adata, color=['cell_type', 'PAX5', 'TCF7', 'EOMES', 'TBX21', 'PRRX2', 'MAFB'],
           title=['GSE115978 - SKCM - Cell types', 'PAX5', 'TCF7', 'EOMES', 'TBX21', 'PRRX2', 'MAFB'], ncols=4, use_raw=False,
          save=' - GSE115978 - rss_gene.svg', palette=COLORS, cmap='magma')
saving figure to file /Users/bramvandesande/Projects/lcb/protocol/figures/tsne - GSE115978 - rss_gene.svg
In [54]:
sc.set_figure_params(frameon=False, dpi=150, fontsize=8)
sc.pl.tsne(adata, color=['cell_type', 'Regulon(PAX5)', 'Regulon(TCF7)', 'Regulon(EOMES)', 'Regulon(TBX21)', 'Regulon(PRRX2)', 'Regulon(MAFB)', 'Regulon(SOX7)'],
           title=['GSE115978 - SKCM - Cell types', 'Regulon(PAX5)', 'Regulon(TCF7)', 'Regulon(EOMES)', 'Regulon(TBX21)', 'Regulon(PRRX2)', 'Regulon(MAFB)', 'Regulon(SOX7)'], ncols=4, use_raw=False,
          save=' - GSE115978 - rss_regulon.svg', palette=COLORS, cmap='viridis')
saving figure to file /Users/bramvandesande/Projects/lcb/protocol/figures/tsne - GSE115978 - rss_regulon.svg

SELECTED REGULONS

With the metadata from SCENIC add to the AnnData object, we can create cellular scatterplots for regulon activity. In the example we compare MITF regulon enrichment with the expression of the MITF gene.

In [55]:
sc.pl.tsne(adata, color=['cell_type', 'Regulon(MITF)', 'Regulon(TWIST1)'],
           title=['GSE115978 - SKCM - Cell types', 'Regulon(MITF)', 'Regulon(TWIST1)'], ncols=3, use_raw=False,
          save=' - GSE115978 - selected_regulons1.svg', palette=COLORS, cmap='viridis')
saving figure to file /Users/bramvandesande/Projects/lcb/protocol/figures/tsne - GSE115978 - selected_regulons1.svg
In [56]:
sc.pl.tsne(adata, color=['cell_type', 'MITF'],
           title=['GSE115978 - SKCM - Cell types', 'MITF'], ncols=3, use_raw=False,
          save=' - GSE115978 - selected_regulons2.svg', palette=COLORS, cmap='magma')
saving figure to file /Users/bramvandesande/Projects/lcb/protocol/figures/tsne - GSE115978 - selected_regulons2.svg
In [57]:
sc.set_figure_params(frameon=False, dpi=150, fontsize=8)
sc.pl.tsne(adata, color=['cell_type', 'MYC'], 
           title=['GSE115978 - SKCM - Cell types', 'MYC - Gene'], ncols=3, palette=COLORS, use_raw=False,
          save=' - GSE115978 - AUCell+tSNE - Gene MYC.svg', cmap='magma')
saving figure to file /Users/bramvandesande/Projects/lcb/protocol/figures/tsne - GSE115978 - AUCell+tSNE - Gene MYC.svg
In [58]:
sc.set_figure_params(frameon=False, dpi=150, fontsize=8)
sc.pl.tsne(adata, color=['cell_type', 'Regulon(MYC)'], 
           title=['GSE115978 - SKCM - Cell types', 'MYC - Regulon'], ncols=3, palette=COLORS, use_raw=False,
          save=' - GSE115978 - AUCell+tSNE - Regulon MYC.svg', cmap='viridis')
saving figure to file /Users/bramvandesande/Projects/lcb/protocol/figures/tsne - GSE115978 - AUCell+tSNE - Regulon MYC.svg

AUCELL DISTRIBUTIONS

We can also analyze the distribution of AUCell values via the plots in the scanpy package.

In [59]:
df_results.sort_values('Z', ascending=False).groupby(by='cell_type').head(2)
Out[59]:
cell_type regulon Z
1006 Endothelial Cell SOX7 6.406142
798 Endothelial Cell ERG 6.381740
1341 Fibroblast PRRX2 5.646514
1136 Fibroblast CREB3L1 5.578842
1636 Macrophage MAFB 3.544983
1490 Macrophage ATF5 3.512604
579 B Cell PAX5 2.465981
593 B Cell POU2F2 2.420473
2502 NK Cell TBX21 2.365940
2390 NK Cell MYBL1 1.845234
3418 Thelper Cell FOXP3 1.421662
3016 Tcytotoxic Cell EOMES 1.403587
2012 Malignant Cell MITF 1.355042
1915 Malignant Cell ETV4 1.334729
3207 Tcytotoxic Cell RUNX3 1.259551
3485 Thelper Cell MAF 1.154410
338 ? ZNF329 1.000743
2876 T Cell TCF7 0.875316
227 ? PPARA 0.842881
2677 T Cell FOXP1 0.779080
In [60]:
aucell_adata = sc.AnnData(X=auc_mtx.sort_index())
aucell_adata.obs = df_obs
names = list(map(op.attrgetter('name'), filter(lambda r: r.score > 8.0, regulons)))
sc.pl.stacked_violin(aucell_adata, names, groupby='cell_type',
          save=' - GSE115978 - regulons.svg')
saving figure to file /Users/bramvandesande/Projects/lcb/protocol/figures/stacked_violin - GSE115978 - regulons.svg
Out[60]:
[<matplotlib.axes._subplots.AxesSubplot at 0x13fa03ef0>,
 <matplotlib.axes._subplots.AxesSubplot at 0x13f86a320>,
 <matplotlib.axes._subplots.AxesSubplot at 0x13d9653c8>,
 <matplotlib.axes._subplots.AxesSubplot at 0x13fd3e4e0>,
 <matplotlib.axes._subplots.AxesSubplot at 0x144eac7f0>,
 <matplotlib.axes._subplots.AxesSubplot at 0x13fdc9ac8>,
 <matplotlib.axes._subplots.AxesSubplot at 0x13dc39d30>,
 <matplotlib.axes._subplots.AxesSubplot at 0x18be69c50>,
 <matplotlib.axes._subplots.AxesSubplot at 0x13c7fc198>,
 <matplotlib.axes._subplots.AxesSubplot at 0x13c7af080>]

STEP 7: Export to SCope

In [61]:
export2loom(adata.to_df(), regulons, LOOM_FNAME,
            cell_annotations=adata.obs['cell_type'].to_dict(),
            embeddings=OrderedDict([('AUCell + tSNE', embedding_aucell_tsne), ('PCA + tSNE', embedding_pca_tsne)]),
            auc_mtx = auc_mtx,
            tree_structure=(),
            title='{}_{}'.format(TCGA_CODE, DATASET_ID),
            nomenclature="HGNC", auc_thresholds=thresholds,
            compress=True)
Regulon name does not seem to be compatible with SCOPE. It should include a space to allow selection of the TF. 
Please run: 
 regulons = [r.rename(r.name.replace('(+)',' ('+str(len(r))+'g)')) for r in regulons] 
or:
 regulons = [r.rename(r.name.replace('(',' (')) for r in regulons]

GSE103322 - Head and Neck Squamous Cell Carcinoma (HNSC)

Publication: Puram S et al. Single-Cell Transcriptomic Analysis of Primary and Metastatic Tumor Ecosystems in Head and Neck Cancer. Cell 2017 https://dx.doi.org/10.1016/j.cell.2017.10.044

Data set acquisition: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE103322

In [98]:
DATASET_ID = "GSE103322"
TCGA_CODE = 'HNSC'

Resources downloaded.

In [99]:
# Downloaded from GEO on 15 JUN 2019.
ALL_FNAME = os.path.join(RESOURCES_FOLDERNAME, "GSE103322_HNSCC_all_data.txt")

Results created.

In [100]:
METADATA_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.metadata.csv'.format(DATASET_ID))
EXP_MTX_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.tpm.csv'.format(DATASET_ID))
EXP_MTX_QC_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.qc.tpm.csv'.format(DATASET_ID))
ADJACENCIES_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.adjacencies.tsv'.format(DATASET_ID))
MOTIFS_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.motifs.csv'.format(DATASET_ID))
REGULONS_DAT_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.regulons.dat'.format(DATASET_ID))
AUCELL_MTX_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.auc.csv'.format(DATASET_ID))
ANNDATA_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.h5ad'.format(DATASET_ID))
LOOM_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}_{}.loom'.format(TCGA_CODE, DATASET_ID))
BIN_MTX_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.bin.csv'.format(DATASET_ID))
THR_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.thresholds.csv'.format(DATASET_ID))

STEP 0: Preprocessing

CLEANING

In [65]:
def process_gse103322(fname):
    # Load CSV file
    mtx = pd.read_csv(fname, sep='\t', index_col=0, skiprows=[1,2,3,4,5])
    
    # Extract gene symbol
    mtx.index = list(map(lambda g: g[1:-1], mtx.index))
    
    # Remove duplicate gene symbols (keep last when sorted ascending order according to row sum)
    mtx = mtx.iloc[mtx.sum(axis=1).argsort()]
    mtx = mtx[~mtx.index.duplicated(keep='last')]
    
    return mtx
In [66]:
df_mtx = process_gse103322(ALL_FNAME)
df_mtx.to_csv(EXP_MTX_FNAME, index=True)
In [67]:
cell_annotations = pd.read_csv(ALL_FNAME, sep='\t', index_col=0, low_memory=False).head(5)
cell_type = pd.Series(data=list(map(lambda t: t[0] if t[1] else 'Malignant Cell',
                                    zip(cell_annotations.T['non-cancer cell type'], 
                                        cell_annotations.T['classified as non-cancer cells'].map(float).map(bool)))),
                      index = cell_annotations.columns, name='cell_type').replace({'-Fibroblast': 'Fibroblast',
                  'myocyte': 'Myocyte',
                  'Endothelial': 'Endothelial Cell',
                  'Dendritic': 'Dendritic Cell',
                  'Mast': 'Mast Cell',
                  'T cell': 'T Cell',
                  'B cell': 'B Cell'})
lesion_type = pd.Series(data=list(map(lambda t: 'Lymph node' if t else 'Primary tumor',
                                    cell_annotations.T['Lymph node'].map(float).map(bool))),
                      index = cell_annotations.columns, name='lesion_type')
def clean_pid(s: str) -> str:
    m = re.match("HN(SCC)?_?(\d{1,2})_", s)
    if m:
        return 'MEEI{}'.format(int(m.group(2)))
    else:
        return 'MEEI9'
patient_id = pd.Series(data=cell_annotations.columns.map(clean_pid), index = cell_annotations.columns, name='patient_id')
df_metadata = pd.concat([cell_type, lesion_type, patient_id], axis=1).reset_index().rename(columns={'index': 'cell_id'})
df_metadata.to_csv(METADATA_FNAME, sep=',')
df_metadata.head()
Out[67]:
cell_id cell_type lesion_type patient_id
0 HN28_P15_D06_S330_comb Fibroblast Lymph node MEEI28
1 HN28_P6_G05_S173_comb Fibroblast Primary tumor MEEI28
2 HN26_P14_D11_S239_comb Malignant Cell Lymph node MEEI26
3 HN26_P14_H05_S281_comb Fibroblast Lymph node MEEI26
4 HN26_P25_H09_S189_comb Malignant Cell Lymph node MEEI26

EXPRESSION MATRIX QC

In [68]:
adata = sc.AnnData(X=df_mtx.T.sort_index())
df_obs = df_metadata.set_index('cell_id').sort_index()
adata.obs = df_obs
adata.var_names_make_unique()
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)
adata.raw = adata #Store non-log transformed data as raw. This data can be used via the use_raw parameters available for many functions.
sc.pp.log1p(adata)
adata
Out[68]:
AnnData object with n_obs × n_vars = 5902 × 21519 
    obs: 'cell_type', 'lesion_type', 'patient_id', 'n_genes'
    var: 'n_cells'
In [69]:
adata.to_df().to_csv(EXP_MTX_QC_FNAME)

STEP 1: Network inference based on GRNBoost2 from CLI

For this step the CLI version of SCENIC is used. This step can be deployed on an High Performance Computing system.

Output: List of adjacencies between a TF and its targets stored in ADJACENCIES_FNAME.

!pyscenic grn {EXP_MTX_QC_FNAME} {HUMAN_TFS_FNAME} -o {ADJACENCIES_FNAME} --num_workers 32
2019-04-25 09:07:25,605 - pyscenic.cli.pyscenic - INFO - Loading expression matrix.
2019-04-25 09:08:25,870 - pyscenic.cli.pyscenic - INFO - Inferring regulatory networks.
preparing dask client
parsing input
creating dask graph
32 partitions
computing dask graph
not shutting down client, client was created externally
finished
2019-04-25 10:55:14,802 - pyscenic.cli.pyscenic - INFO - Writing results to file.

STEP 2-3: Regulon prediction aka cisTarget from CLI

For this step the CLI version of SCENIC is used. This step can be deployed on an High Performance Computing system.

Output: List of adjacencies between a TF and its targets stored in MOTIFS_FNAME.

!pyscenic ctx {ADJACENCIES_FNAME} {DBS_PARAM} \ --annotations_fname {MOTIF_ANNOTATIONS_FNAME} \ --expression_mtx_fname {EXP_MTX_QC_FNAME} \ --output {MOTIFS_FNAME} \ --num_workers 32
2019-04-26 09:44:55,314 - pyscenic.cli.pyscenic - INFO - Creating modules.
2019-04-26 09:45:00,335 - pyscenic.cli.pyscenic - INFO - Loading expression matrix.
2019-04-26 09:49:37,344 - pyscenic.cli.pyscenic - INFO - Loading databases.
2019-04-26 09:49:37,345 - pyscenic.cli.pyscenic - INFO - Calculating regulons.
2019-04-26 10:22:54,812 - pyscenic.cli.pyscenic - INFO - Writing results to file.
In [70]:
df_motifs = load_motifs(MOTIFS_FNAME)

STEP 4: Cellular enrichment aka AUCell

REGULON CREATION

In [71]:
regulons = derive_regulons(df_motifs)
# Pickle these regulons.
with open(REGULONS_DAT_FNAME, 'wb') as f:
    pickle.dump(regulons, f)

AUCELL

%%time auc_mtx = aucell(df_mtx.T, regulons, num_workers=32) auc_mtx.to_csv(AUCELL_MTX_FNAME)

` CPU times: user 18.2 s, sys: 8.92 s, total: 27.2 s Wall time: 30.7 s

In [72]:
auc_mtx = pd.read_csv(AUCELL_MTX_FNAME, index_col=0)

STEP 6: Non-linear projection and clustering

In this step a scanpy.AnnData object is created containing all metadata and results.

First, we select and retain only the most variable genes in the dataset.

In [73]:
sc.pp.highly_variable_genes(adata)
adata = adata[:, adata.var['highly_variable']]

Then we apply a linear-dimensional reduction technique (PCA).

In [74]:
sc.tl.pca(adata, svd_solver='arpack')
Trying to set attribute `.obsm` of view, making a copy.

Followed by a tSNE projection.

In [75]:
sc.tl.tsne(adata)
In [76]:
sc.set_figure_params(frameon=False, dpi=150, fontsize=8)
sc.pl.tsne(adata, color=['cell_type', 'lesion_type','patient_id'], 
           title=['GSE103322 - HNSC - Cell types', 'GSE103322 - HNSC - Lesion types',
                 'GSE103322 - HNSC - {} patients'.format(len(adata.obs.patient_id.unique()))], ncols=3,
          save=' - GSE103322 - PCA+tSNE.svg')
... storing 'cell_type' as categorical
... storing 'lesion_type' as categorical
... storing 'patient_id' as categorical
saving figure to file /Users/bramvandesande/Projects/lcb/protocol/figures/tsne - GSE103322 - PCA+tSNE.svg

We capture the non-linear projection based on PCA+tSNE for later storage in the loom file.

In [77]:
embedding_pca_tsne = pd.DataFrame(adata.obsm['X_tsne'], columns=[['_X', '_Y']], index=adata.obs_names)

We add all metadata derived from SCENIC to the scanpy.AnnData object.

In [78]:
add_scenic_metadata(adata, auc_mtx, regulons)
adata.write_h5ad(ANNDATA_FNAME)

We change the tSNE projection so that it relies on AUCell instead of PCA.

In [79]:
sc.tl.tsne(adata, use_rep='X_aucell')
In [80]:
embedding_aucell_tsne = pd.DataFrame(adata.obsm['X_tsne'], columns=[['_X', '_Y']], index=adata.obs_names)
In [81]:
sc.set_figure_params(frameon=False, dpi=150, fontsize=8)
sc.pl.tsne(adata, color=['cell_type', 'lesion_type','patient_id'], 
           title=['GSE103322 - HNSC - Cell types', 'GSE103322 - HNSC - Lesion types',
                 'GSE103322 - HNSC - {} patients'.format(len(adata.obs.patient_id.unique()))], ncols=3,
          save=' - GSE103322 - AUCell+tSNE.svg')
saving figure to file /Users/bramvandesande/Projects/lcb/protocol/figures/tsne - GSE103322 - AUCell+tSNE.svg

CELL TYPE SPECIFIC REGULATORS - RSS

In [82]:
rss = regulon_specificity_scores(auc_mtx, adata.obs.cell_type)
rss.head()
Out[82]:
ALX4 ARID5A ATF3 ATF4 ATF5 ATF6 BACH1 BHLHE22 BHLHE40 BHLHE41 ... ZNF782 ZNF841 ZNF845 ZNF860 ZNF90 ZSCAN16 ZSCAN25 ZSCAN26 ZSCAN31 ZXDA
T Cell 0.105765 0.266315 0.180052 0.187602 0.209422 0.189920 0.202286 0.136789 0.173929 0.172628 ... 0.182813 0.157230 0.106997 0.100908 0.138456 0.176075 0.185942 0.193140 0.149209 0.189537
Fibroblast 0.137145 0.210495 0.264651 0.226924 0.155427 0.213918 0.188293 0.118381 0.239243 0.161765 ... 0.128817 0.137113 0.133661 0.106544 0.144966 0.175647 0.174031 0.163300 0.151327 0.139194
Macrophage 0.087828 0.103199 0.101410 0.099680 0.128901 0.103953 0.094299 0.111822 0.105420 0.141455 ... 0.092079 0.089041 0.088720 0.087556 0.090825 0.097310 0.101442 0.095248 0.093770 0.110627
B Cell 0.089182 0.116266 0.105917 0.108869 0.132251 0.115603 0.111066 0.089307 0.099510 0.134946 ... 0.094093 0.100328 0.098470 0.089679 0.099794 0.099613 0.104919 0.108197 0.095932 0.099406
Malignant Cell 0.097324 0.166367 0.235926 0.277860 0.143060 0.279980 0.177926 0.094829 0.224185 0.135617 ... 0.095654 0.100051 0.090538 0.093358 0.179869 0.297720 0.173363 0.122495 0.309663 0.114854

5 rows × 349 columns

In [83]:
sns.set()
sns.set(style='whitegrid', font_scale=0.8)
fig, ((ax1, ax2, ax3, ax4), (ax5, ax6, ax7, ax8)) = plt.subplots(2, 4, figsize=(8, 6), dpi=100)
plot_rss(rss, 'B Cell', ax=ax1)
ax1.set_xlabel('')
plot_rss(rss, 'T Cell', ax=ax2)
ax2.set_xlabel('')
ax2.set_ylabel('')
plot_rss(rss, 'Mast Cell', ax=ax3)
ax3.set_xlabel('')
ax3.set_ylabel('')
plot_rss(rss, 'Dendritic Cell', ax=ax4)
ax4.set_xlabel('')
ax4.set_ylabel('')
plot_rss(rss, 'Myocyte', ax=ax5)
plot_rss(rss, 'Fibroblast', ax=ax6)
ax6.set_ylabel('')
plot_rss(rss, 'Macrophage', ax=ax7)
ax7.set_ylabel('')
plot_rss(rss, 'Endothelial Cell', ax=ax8)
ax8.set_ylabel('')
plt.tight_layout()
savesvg('plots - GSE103322 - rss.svg', fig)

CELL TYPE SPECIFIC REGULATORS - Z-SCORE

To find cell type specific regulators we use a Z score (i.e. the average AUCell score for the cells of a give type are standardized using the overall average AUCell scores and its standard deviation).

In [84]:
df_obs = adata.obs
signature_column_names = list(df_obs.select_dtypes('number').columns)
signature_column_names = list(filter(lambda s: s.startswith('Regulon('), signature_column_names))
df_scores = df_obs[signature_column_names + ['cell_type']]
df_results = ((df_scores.groupby(by='cell_type').mean() - df_obs[signature_column_names].mean())/ df_obs[signature_column_names].std()).stack().reset_index().rename(columns={'level_1': 'regulon', 0:'Z'})
df_results['regulon'] = list(map(lambda s: s[8:-1], df_results.regulon))
df_results[(df_results.Z >= 3.0)].sort_values('Z', ascending=False).head()
Out[84]:
cell_type regulon Z
2650 Myocyte PPARA 14.056566
2611 Myocyte MYOG 13.692796
2610 Myocyte MYOD1 13.162975
2609 Myocyte MYF6 12.873788
2674 Myocyte SNAI3 11.989173
In [85]:
df_heatmap = pd.pivot_table(data=df_results[df_results.Z >= 3.0].sort_values('Z', ascending=False),
                           index='cell_type', columns='regulon', values='Z')
#df_heatmap.drop(index='Myocyte', inplace=True) # We leave out Myocyte because many TFs are highly enriched (becuase of small number of cells).
fig, ax1 = plt.subplots(1, 1, figsize=(10, 8))
sns.heatmap(df_heatmap, ax=ax1, annot=True, fmt=".1f", linewidths=.7, cbar=False, square=True, linecolor='gray', 
            cmap="YlGnBu", annot_kws={"size": 6})
ax1.set_ylabel('')
savesvg('heatmap - GSE103322 - regulons.svg', fig)
In [86]:
sc.pl.tsne(adata, color=['cell_type', 'Regulon(NFE2)', 'Regulon(XBP1)'],
           title=['GSE103322 - HNSC - Cell types', 'NFE2', 'XBP1'], ncols=3, use_raw=False,
          save=' - GSE103322 - regulons.svg')
saving figure to file /Users/bramvandesande/Projects/lcb/protocol/figures/tsne - GSE103322 - regulons.svg

SELECTED REGULONS

In [87]:
sc.set_figure_params(frameon=False, dpi=150, fontsize=8)
sc.pl.tsne(adata, color=['cell_type', 'MYC'], 
           title=['GSE103322 - HNSC - Cell types', 'MYC - Gene'], ncols=3, palette=COLORS, use_raw=False,
          save=' - GSE103322 - AUCell+tSNE - Gene MYC.svg', cmap='magma')
saving figure to file /Users/bramvandesande/Projects/lcb/protocol/figures/tsne - GSE103322 - AUCell+tSNE - Gene MYC.svg
In [88]:
sc.set_figure_params(frameon=False, dpi=150, fontsize=8)
sc.pl.tsne(adata, color=['cell_type', 'Regulon(MYC)'], 
           title=['GSE103322 - HNSC - Cell types', 'MYC - Regulon'], ncols=3, palette=COLORS, use_raw=False,
          save=' - GSE103322 - AUCell+tSNE - Regulon MYC.svg', cmap='viridis')
saving figure to file /Users/bramvandesande/Projects/lcb/protocol/figures/tsne - GSE103322 - AUCell+tSNE - Regulon MYC.svg

DISCOVERY OF MORE FINE-GRAINED CELL TYPES AND THEIR REGULATORS

In this notebook we'll subcluster the fibroblasts and find regulators that define these subtypes of fibroblasts. More specifically, we create a nearest neighbour graph on the AUCell-based dimensional reduced cell space. Louvain community detection is used on the resulting graph to clusters cells.

In [89]:
sc.pp.neighbors(adata, use_rep='X_aucell', n_neighbors=3)
sc.tl.louvain(adata)
In [90]:
counter = 1

def init(data, col_name: str = 'cellular_phenotype'):
    data.obs[col_name] = data.obs['cell_type'].astype(str)
    return data

def subcluster(cell_type: str, abbr: str, data, min_cells: int = 20, col_name: str = 'cellular_phenotype'):
    global counter
    counter = 1
    ct_idx = data.obs[data.obs.cell_type == cell_type].index
    df_cell_numbers = data[ct_idx, :].obs.louvain.value_counts()
    clusters = df_cell_numbers[df_cell_numbers >= min_cells].index
    def rename(n: str) -> str:
        global counter
        if n in clusters:
            res = '{}{}'.format(abbr, counter)
            counter += 1
        else:
            res = '?'
        return res
    data.obs.loc[ct_idx, [col_name]] = data[ct_idx, :].obs['louvain'].map(rename).values
    return data
In [91]:
adata = init(adata)
adata = subcluster('Fibroblast', 'F', adata)
In [92]:
adata.write_h5ad(ANNDATA_FNAME)
... storing 'cellular_phenotype' as categorical
In [93]:
adata.obs.cellular_phenotype.dtype.categories
Out[93]:
Index(['?', 'B Cell', 'Dendritic Cell', 'Endothelial Cell', 'F1', 'F2', 'F3',
       'F4', 'Macrophage', 'Malignant Cell', 'Mast Cell', 'Myocyte', 'T Cell'],
      dtype='object')
In [94]:
N_COLORS = len(adata.obs.cellular_phenotype.dtype.categories)
COLORS = [color['color'] for color in mpl.rcParams["axes.prop_cycle"]]
sc.pl.tsne(adata, color=['cellular_phenotype'],
           title=['GSE103322 - HNSC - Cell types'], ncols=3, use_raw=False, palette=sns.color_palette("Set3"), legend_loc='on data')
WARNING: Length of palette colors is smaller than the number of categories (palette length: 12, categories length: 13. Some categories will have the same color.

We use RSS to identify regulators for these different types of fibroblasts.

In [95]:
rss = regulon_specificity_scores(auc_mtx, adata.obs.cellular_phenotype)
rss.head()
Out[95]:
ALX4 ARID5A ATF3 ATF4 ATF5 ATF6 BACH1 BHLHE22 BHLHE40 BHLHE41 ... ZNF782 ZNF841 ZNF845 ZNF860 ZNF90 ZSCAN16 ZSCAN25 ZSCAN26 ZSCAN31 ZXDA
T Cell 0.105765 0.266315 0.180052 0.187602 0.209422 0.189920 0.202286 0.136789 0.173929 0.172628 ... 0.182813 0.157230 0.106997 0.100908 0.138456 0.176075 0.185942 0.193140 0.149209 0.189537
F1 0.111585 0.170120 0.200347 0.171284 0.124413 0.159399 0.157533 0.113141 0.192056 0.123248 ... 0.119425 0.132125 0.125211 0.097206 0.131603 0.143259 0.135203 0.145601 0.122982 0.122469
Macrophage 0.087828 0.103199 0.101410 0.099680 0.128901 0.103953 0.094299 0.111822 0.105420 0.141455 ... 0.092079 0.089041 0.088720 0.087556 0.090825 0.097310 0.101442 0.095248 0.093770 0.110627
B Cell 0.089182 0.116266 0.105917 0.108869 0.132251 0.115603 0.111066 0.089307 0.099510 0.134946 ... 0.094093 0.100328 0.098470 0.089679 0.099794 0.099613 0.104919 0.108197 0.095932 0.099406
Malignant Cell 0.097324 0.166367 0.235926 0.277860 0.143060 0.279980 0.177926 0.094829 0.224185 0.135617 ... 0.095654 0.100051 0.090538 0.093358 0.179869 0.297720 0.173363 0.122495 0.309663 0.114854

5 rows × 349 columns

In [96]:
sns.set()
sns.set(style='whitegrid', font_scale=0.8)
fig, ((ax1, ax2, ax3, ax4)) = plt.subplots(1, 4, figsize=(8, 3), dpi=100)
plot_rss(rss, 'F1', ax=ax1)
plot_rss(rss, 'F2', ax=ax2)
ax2.set_ylabel('')
plot_rss(rss, 'F3', ax=ax3)
ax3.set_ylabel('')
plot_rss(rss, 'F4', ax=ax4)
ax3.set_ylabel('')
savesvg('plots - GSE103322 - rss - fibroblasts.svg', fig)

STEP 7: Export to SCope

In [104]:
bin_mtx, thresholds = binarize(auc_mtx)
bin_mtx.to_csv(BIN_MTX_FNAME)
thresholds.to_frame().rename(columns={0:'threshold'}).to_csv(THR_FNAME)
/Users/bramvandesande/miniconda3/envs/scenic_protocol/lib/python3.6/site-packages/pyscenic/diptest.py:30: RuntimeWarning: divide by zero encountered in true_divide
  slopes = (work_cdf[1:] - work_cdf[0]) / distances
/Users/bramvandesande/miniconda3/envs/scenic_protocol/lib/python3.6/site-packages/pyscenic/diptest.py:30: RuntimeWarning: divide by zero encountered in true_divide
  slopes = (work_cdf[1:] - work_cdf[0]) / distances
In [105]:
bin_mtx = pd.read_csv(BIN_MTX_FNAME, index_col=0)
thresholds = pd.read_csv(THR_FNAME, index_col=0).threshold
In [106]:
export2loom(adata.to_df(), regulons, LOOM_FNAME,
            cell_annotations=adata.obs['cell_type'].to_dict(),
            embeddings=OrderedDict([('AUCell + tSNE', embedding_aucell_tsne), ('PCA + tSNE', embedding_pca_tsne)]),
            auc_mtx = auc_mtx,
            tree_structure=(),
            title='{}_{}'.format(TCGA_CODE, DATASET_ID),
            nomenclature="HGNC", auc_thresholds=thresholds,
            compress=True)
Regulon name does not seem to be compatible with SCOPE. It should include a space to allow selection of the TF. 
Please run: 
 regulons = [r.rename(r.name.replace('(+)',' ('+str(len(r))+'g)')) for r in regulons] 
or:
 regulons = [r.rename(r.name.replace('(',' (')) for r in regulons]

E-MTAB-6149/E-MTAB-6653 - Lung Squamous Cell Carcinoma and Lung Adenocarcinoma (LUSC & LUAD)

Publication: Lambrechts D et al. Phenotype molding of stromal cells in the lung tumor microenvironment. Nature Medicine 2018

https://dx.doi.org/10.1038/s41591-018-0096-5

Data set acquisition:

https://www.ebi.ac.uk/arrayexpress/experiments/E-MTAB-6149/

https://www.ebi.ac.uk/arrayexpress/experiments/E-MTAB-6653/

In [107]:
DATASET_ID = "E-MTAB-6149_6653"
TCGA_CODE = 'LUAD_LUSC'

Resources downloaded.

In [108]:
# Directly provided by the author.
# The input matrix was the normalized expression matrix, output from Seurat, from which 9,919 genes passed the filtering 
# (sum of expression > 3 × 0.005 × 52,698 and detected in at least 0.5% of the cells).
ANNOTATIONS_FNAME = os.path.join(RESOURCES_FOLDERNAME, 'b_thienpont.clusters.txt')
EXP_MTX_FNAME = os.path.join(RESOURCES_FOLDERNAME, 'E-MTAB-6149_6653.expr.tsv')

Results created.

In [109]:
METADATA_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.metadata.csv'.format(DATASET_ID))
EXP_MTX_QC_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.qc.expr.csv'.format(DATASET_ID))
ADJACENCIES_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.adjacencies.tsv'.format(DATASET_ID))
MOTIFS_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.motifs.csv'.format(DATASET_ID))
REGULONS_DAT_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.regulons.dat'.format(DATASET_ID))
AUCELL_MTX_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.auc.csv'.format(DATASET_ID))
ANNDATA_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.h5ad'.format(DATASET_ID))
LOOM_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}_{}.loom'.format(TCGA_CODE, DATASET_ID))
BIN_MTX_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.bin.csv'.format(DATASET_ID))
THR_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.thresholds.csv'.format(DATASET_ID))

METADATA CLEANING

In [110]:
annotations = pd.read_csv(ANNOTATIONS_FNAME, sep=' ', index_col=0)
annotations['lesion_type'] = ['LC' if row['CellFromTumor'] else 'NL' for _, row in annotations.iterrows()]
annotations = annotations[['cell', 'CellType', 'Sample', 'Patientnumber', 'lesion_type']].rename(columns={
    'cell': 'cell_id',
    'Sample': 'sample_id',
    'Patientnumber': 'patient_id',
    'CellType': 'cell_type'}).set_index('cell_id')
annotations['cell_type'] = annotations.cell_type.replace({'T_cell': 'T cell',
                               'B_cell': 'B cell',
                               'Fibro': 'Fibroblast',
                               'tumor': 'Malignant cell',
                               'Epi': 'Epithelial cell', 
                               'EC': 'Endothelial cell',
                               'Alveolar': 'Alveolar cell',
                               'Myeloid': 'Meyloid cell'})
annotations['patient_id'] = annotations['patient_id'].map(str)
annotations['sample_id'] = annotations['sample_id'].map(str)
annotations.to_csv(METADATA_FNAME, sep=',')
annotations.head()
Out[110]:
cell_type sample_id patient_id lesion_type
cell_id
AGTTCTACGCATAC_1 Alveolar cell 1 1 LC
ATACCGGATCGTGA_1 Alveolar cell 1 1 LC
ATATACGATGCCCT_1 Alveolar cell 1 1 LC
GAACGTTGATGGTC_1 Alveolar cell 1 1 LC
TCAGACGAGCAAGG_1 Alveolar cell 1 1 LC

EXPRESSION MATRIX QC

In [111]:
df_mtx = pd.read_csv(EXP_MTX_FNAME, sep='\t', index_col=0)
In [112]:
adata = sc.AnnData(X=df_mtx.T.sort_index())
adata.obs = annotations.sort_index()
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)
adata.var_names_make_unique()
adata.raw = adata #Store non-log transformed data as raw. This data can be used via the use_raw parameters available for many functions.
sc.pp.log1p(adata)
adata
Out[112]:
AnnData object with n_obs × n_vars = 51628 × 9919 
    obs: 'cell_type', 'sample_id', 'patient_id', 'lesion_type', 'n_genes'
    var: 'n_cells'
In [239]:
adata.to_df().to_csv(EXP_MTX_QC_FNAME)

STEP 1: Network inference based on GRNBoost2 from CLI

For this step the CLI version of SCENIC is used. This step can be deployed on an High Performance Computing system.

Output: List of adjacencies between a TF and its targets stored in ADJACENCIES_FNAME.

!pyscenic grn {EXP_MTX_QC_FNAME} {HUMAN_TFS_FNAME} -o {ADJACENCIES_FNAME} --num_workers 8
2019-05-02 21:24:43,654 - pyscenic.cli.pyscenic - INFO - Loading expression matrix.
2019-05-02 21:27:06,425 - pyscenic.cli.pyscenic - INFO - Inferring regulatory networks.
preparing dask client
parsing input
creating dask graph
8 partitions
computing dask graph
finished
2019-05-03 17:38:49,059 - pyscenic.cli.pyscenic - INFO - Writing results to file.

STEP 2-3: Regulon prediction aka cisTarget from CLI

For this step the CLI version of SCENIC is used. This step can be deployed on an High Performance Computing system.

Output: List of adjacencies between a TF and its targets stored in MOTIFS_FNAME.

!pyscenic ctx {ADJACENCIES_FNAME} {DBS_PARAM} \ --annotations_fname {MOTIF_ANNOTATIONS_FNAME} \ --expression_mtx_fname {EXP_MTX_QC_FNAME} \ --output {MOTIFS_FNAME} \ --num_workers 18
2019-05-06 20:50:20,880 - pyscenic.cli.pyscenic - INFO - Creating modules.
2019-05-06 20:50:23,528 - pyscenic.cli.pyscenic - INFO - Loading expression matrix.
2019-05-06 20:55:30,410 - pyscenic.cli.pyscenic - INFO - Loading databases.
2019-05-06 20:55:30,410 - pyscenic.cli.pyscenic - INFO - Calculating regulons.
2019-05-06 21:31:19,109 - pyscenic.cli.pyscenic - INFO - Writing results to file.
In [113]:
df_motifs = load_motifs(MOTIFS_FNAME)

STEP 4: Cellular enrichment aka AUCell

REGULON CREATION

In [114]:
regulons = derive_regulons(df_motifs)
# Pickle these regulons.
with open(REGULONS_DAT_FNAME, 'wb') as f:
    pickle.dump(regulons, f)

AUCELL

%%time auc_mtx = aucell(exp_mtx, regulons, num_workers=18) auc_mtx.to_csv(AUCELL_MTX_FNAME)

CPU times: user 54.2 s, sys: 31.2 s, total: 1min 25s
Wall time: 1min 43s
In [115]:
auc_mtx = pd.read_csv(AUCELL_MTX_FNAME, index_col=0)

STEP 6: Non-linear projection and clustering

In this step a scanpy.AnnData object is created containing all metadata and results.

First, we select and retain only the most variable genes in the dataset.

In [116]:
sc.pp.highly_variable_genes(adata)
adata = adata[:, adata.var['highly_variable']]

Then we apply a linear-dimensional reduction technique (PCA).

In [117]:
sc.tl.pca(adata, svd_solver='arpack')
Trying to set attribute `.obsm` of view, making a copy.

Followed by a tSNE projection.

In [118]:
sc.tl.tsne(adata)
In [119]:
sc.set_figure_params(frameon=False, dpi=150, fontsize=8)
sc.pl.tsne(adata, color=['cell_type', 'lesion_type', 'patient_id'], 
           title=['E-MTAB-6149/6653 - LUAD/LUSC - Cell types', 
                  'E-MTAB-6149/6653 - LUAD/LUSC - Lesion types', 
                  'E-MTAB-6149/6653 - LUAD/LUSC - {} patients'.format(len(adata.obs.patient_id.unique()))], ncols=3,
          save=' - E-MTAB-6149_6653 - PCA+tSNE.svg')
... storing 'cell_type' as categorical
... storing 'sample_id' as categorical
... storing 'patient_id' as categorical
... storing 'lesion_type' as categorical
saving figure to file /Users/bramvandesande/Projects/lcb/protocol/figures/tsne - E-MTAB-6149_6653 - PCA+tSNE.svg

We capture the non-linear projection based on PCA+tSNE for later storage in the loom file.

In [120]:
embedding_pca_tsne = pd.DataFrame(adata.obsm['X_tsne'], columns=[['_X', '_Y']], index=adata.obs_names)

We add all metadata derived from SCENIC to the scanpy.AnnData object.

In [121]:
add_scenic_metadata(adata, auc_mtx, regulons)
adata.write_h5ad(ANNDATA_FNAME)

We change the tSNE projection so that it relies on AUCell instead of PCA.

In [122]:
sc.tl.tsne(adata, use_rep='X_aucell')
In [123]:
embedding_aucell_tsne = pd.DataFrame(adata.obsm['X_tsne'], columns=[['_X', '_Y']], index=adata.obs_names)
In [124]:
sc.set_figure_params(frameon=False, dpi=150, fontsize=8)
sc.pl.tsne(adata, color=['cell_type', 'lesion_type', 'patient_id'], 
           title=['E-MTAB-6149/6653 - LUAD/LUSC - Cell types', 
                  'E-MTAB-6149/6653 - LUAD/LUSC - Lesion types', 
                  'E-MTAB-6149/6653 - LUAD/LUSC - {} patients'.format(len(adata.obs.patient_id.unique()))], ncols=3,
          save=' - E-MTAB-6149_6653 - AUCell+tSNE.svg')
saving figure to file /Users/bramvandesande/Projects/lcb/protocol/figures/tsne - E-MTAB-6149_6653 - AUCell+tSNE.svg

Display AUCell scores on top of cellular scatterplot for selected regulons.

In [125]:
sc.set_figure_params(frameon=False, dpi=150, fontsize=8)
sc.pl.tsne(adata, color=['MYC'], 
           title=['MYC - Gene'], ncols=3, palette=COLORS, use_raw=False,
          save=' - E-MTAB-6149_6653 - AUCell+tSNE - Gene MYC.svg', cmap='magma')
saving figure to file /Users/bramvandesande/Projects/lcb/protocol/figures/tsne - E-MTAB-6149_6653 - AUCell+tSNE - Gene MYC.svg
In [126]:
sc.set_figure_params(frameon=False, dpi=150, fontsize=8)
sc.pl.tsne(adata, color=['Regulon(MYC)'], 
           title=['MYC - Regulon'], ncols=3, palette=COLORS, use_raw=False,
          save=' - E-MTAB-6149_6653 - AUCell+tSNE - Regulon MYC.svg', cmap='viridis')
saving figure to file /Users/bramvandesande/Projects/lcb/protocol/figures/tsne - E-MTAB-6149_6653 - AUCell+tSNE - Regulon MYC.svg

To find cell type specific regulators we use a Z score (i.e. the average AUCell score for the cells of a give type are standardized using the overall average AUCell scores and its standard deviation).

In [127]:
df_obs = adata.obs
signature_column_names = list(df_obs.select_dtypes('number').columns)
signature_column_names = list(filter(lambda s: s.startswith('Regulon('), signature_column_names))
df_scores = df_obs[signature_column_names + ['cell_type']]
df_results = ((df_scores.groupby(by='cell_type').mean() - df_obs[signature_column_names].mean())/ df_obs[signature_column_names].std()).stack().reset_index().rename(columns={'level_1': 'regulon', 0:'Z'})
df_results['regulon'] = list(map(lambda s: s[8:-1], df_results.regulon))
df_results[(df_results.Z >= 3.0)].sort_values('Z', ascending=False).head()
Out[127]:
cell_type regulon Z
696 Epithelial cell RFX3 9.483804
695 Epithelial cell RFX2 6.800633
623 Epithelial cell GLIS3 5.315495
517 Endothelial cell SOX7 4.738095
108 Alveolar cell NKX2-1 4.621914
In [128]:
df_heatmap = pd.pivot_table(data=df_results[df_results.Z >= 2.5].sort_values('Z', ascending=False),
                           index='cell_type', columns='regulon', values='Z')
fig, ax1 = plt.subplots(1, 1, figsize=(10, 8))
sns.heatmap(df_heatmap, ax=ax1, annot=True, fmt=".1f", linewidths=.7, cbar=False, square=True, linecolor='gray', 
            cmap="YlGnBu", annot_kws={"size": 6})
ax1.set_ylabel('')
savesvg('heatmap - E-MTAB-6149_6653 - regulons.svg', fig)
In [129]:
sc.pl.tsne(adata, color=['cell_type', 'Regulon(RFX3)', 'Regulon(FOXO1)'],
           title=['E-MTAB-6149/6653 - LUAD/LUSC - Cell types', 'RFX3', 'FOXO1'], ncols=3, use_raw=False,
          save=' - E-MTAB-6149_6653 - regulons.svg')
saving figure to file /Users/bramvandesande/Projects/lcb/protocol/figures/tsne - E-MTAB-6149_6653 - regulons.svg

STEP 7: Export to SCope

In [131]:
bin_mtx, thresholds = binarize(auc_mtx)
bin_mtx.to_csv(BIN_MTX_FNAME)
thresholds.to_frame().rename(columns={0:'threshold'}).to_csv(THR_FNAME)
/Users/bramvandesande/miniconda3/envs/scenic_protocol/lib/python3.6/site-packages/pyscenic/diptest.py:30: RuntimeWarning: divide by zero encountered in true_divide
  slopes = (work_cdf[1:] - work_cdf[0]) / distances
/Users/bramvandesande/miniconda3/envs/scenic_protocol/lib/python3.6/site-packages/pyscenic/diptest.py:30: RuntimeWarning: divide by zero encountered in true_divide
  slopes = (work_cdf[1:] - work_cdf[0]) / distances
/Users/bramvandesande/miniconda3/envs/scenic_protocol/lib/python3.6/site-packages/pyscenic/diptest.py:30: RuntimeWarning: divide by zero encountered in true_divide
  slopes = (work_cdf[1:] - work_cdf[0]) / distances
/Users/bramvandesande/miniconda3/envs/scenic_protocol/lib/python3.6/site-packages/pyscenic/diptest.py:30: RuntimeWarning: divide by zero encountered in true_divide
  slopes = (work_cdf[1:] - work_cdf[0]) / distances
/Users/bramvandesande/miniconda3/envs/scenic_protocol/lib/python3.6/site-packages/pyscenic/diptest.py:30: RuntimeWarning: divide by zero encountered in true_divide
  slopes = (work_cdf[1:] - work_cdf[0]) / distances
/Users/bramvandesande/miniconda3/envs/scenic_protocol/lib/python3.6/site-packages/pyscenic/diptest.py:30: RuntimeWarning: divide by zero encountered in true_divide
  slopes = (work_cdf[1:] - work_cdf[0]) / distances
/Users/bramvandesande/miniconda3/envs/scenic_protocol/lib/python3.6/site-packages/pyscenic/diptest.py:30: RuntimeWarning: divide by zero encountered in true_divide
  slopes = (work_cdf[1:] - work_cdf[0]) / distances
/Users/bramvandesande/miniconda3/envs/scenic_protocol/lib/python3.6/site-packages/pyscenic/diptest.py:30: RuntimeWarning: divide by zero encountered in true_divide
  slopes = (work_cdf[1:] - work_cdf[0]) / distances
/Users/bramvandesande/miniconda3/envs/scenic_protocol/lib/python3.6/site-packages/pyscenic/diptest.py:30: RuntimeWarning: divide by zero encountered in true_divide
  slopes = (work_cdf[1:] - work_cdf[0]) / distances
/Users/bramvandesande/miniconda3/envs/scenic_protocol/lib/python3.6/site-packages/pyscenic/diptest.py:30: RuntimeWarning: divide by zero encountered in true_divide
  slopes = (work_cdf[1:] - work_cdf[0]) / distances
In [132]:
bin_mtx = pd.read_csv(BIN_MTX_FNAME, index_col=0)
thresholds = pd.read_csv(THR_FNAME, index_col=0).threshold
In [133]:
export2loom(adata.to_df(), regulons, LOOM_FNAME,
            cell_annotations=adata.obs['cell_type'].to_dict(),
            embeddings=OrderedDict([('AUCell + tSNE', embedding_aucell_tsne), ('PCA + tSNE', embedding_pca_tsne)]),
            auc_mtx = auc_mtx,
            tree_structure=(),
            title='{}_{}'.format(TCGA_CODE, DATASET_ID),
            nomenclature="HGNC", auc_thresholds=thresholds,
            compress=True)
Regulon name does not seem to be compatible with SCOPE. It should include a space to allow selection of the TF. 
Please run: 
 regulons = [r.rename(r.name.replace('(+)',' ('+str(len(r))+'g)')) for r in regulons] 
or:
 regulons = [r.rename(r.name.replace('(',' (')) for r in regulons]
In [ ]: